clear all
set more off
set maxvar 32767
//set max_memory 16g, once
ssc install nearstat
ssc install carryforward

local main "E:\Florida Wind Patterns\SFC data"
cd "`main'"

// year
forval yy = 2003/2012 {
****************
****************
scalar prefdist = 5 // specify max distance between EPA site and wind monitors (any site further than this value is excluded)
****************	
****************

***************************************************************
// Identify and drop all monitors with >5% zeros IN THE HOURLY FILE winddir_lax.dta
// winddir_lax.dta is the cleaned wind station dataset (end result of STEP 1 in 
// wind_pollution_..._5mi_lax.do, around line 386)
***************************************************************
use "`main'\winddir_lax_`yy'.dta", clear
keep if year == `yy'
drop *mode*
gen tag0w = (windvalue==0)
levelsof station, loc(wid) clean
foreach cc of local wid {
	qui sum tag0w if tag0w==1 & station=="`cc'"
	scalar n0 = r(N)
	qui sum windvalue if station=="`cc'"
	scalar nwind = r(N)
	scalar pct0w = n0/nwind*100
	if pct0w>5 drop if station=="`cc'"

}

save "`main'\winddir_dropmonitor_`yy'.dta", replace
tempfile winddir
save `winddir', replace

***************************************************************
// Calculate distances between EPA and MADIS sites
***************************************************************
use "`main'\EPAlonlat_FL_`yy'.dta", clear
levelsof sitecnty, loc(epasite)
rename sitecnty EPAsite
tempfile epa
save `epa', replace

// Get wind site longitude latitude, then merge with epa 
use `winddir', clear
levelsof station, loc(stationID) clean
scalar nwind = wordcount("`stationID'") // count number of stations
keep station lat lon
drop if mi(lat)
duplicates drop station, force // some stations (esp on water) change lat-lon combination often, this keeps just one combo for each station
drop if station==""
gen id = _n
rename (lat lon) (lat_ lon_)
reshape wide lat_ lon_, i(id) j(station) string
carryforward *, replace
keep if id ==_N
replace id = 1 

merge 1:m id using `epa', nogen
drop id
order EPA*

di "********************* CALCULATE DISTANCES START *****************************"
// calculate distance between each EPA site and each wind site
foreach stid of local stationID {
	nearstat EPAlat_ EPAlon_, near( lat_`stid' lon_`stid') distvar(dist_`stid') r(3958.761) replace
}

di "********************* CALCULATE DISTANCES END *****************************"


di "********************* ORDER BY DISTANCE *****************************"

rowsort dist_*, gen(dist1-dist`=nwind') // sort distances in ascending order

di "********************* IDENTIFY STATIONS WITHIN X MILES*****************************"

// identify closest wind stations within `prefdist' miles of pollution site
forval vv = 1/`=nwind' {
replace dist`vv' = . if dist`vv'>prefdist // exclude wind sites that are further than "prefdist" miles, prefdist specified at the very top 
gen windsite`vv' = ""
gen windlat`vv' = .
gen windlon`vv' = .
foreach ww of local stationID {
	replace windsite`vv' = "`ww'" if abs(dist`vv' - dist_`ww')<0.0001
	replace windlat`vv' = lat_`ww' if abs(dist`vv' - dist_`ww')<0.0001
	replace windlon`vv' = lon_`ww' if abs(dist`vv' - dist_`ww')<0.0001
}
}

dropmiss *, force // drops distance variables that have all missing observations
drop dist_* lat_* lon_*
rename (EPAlat_ EPAlon_) (EPAlat EPAlon)

di "********************* SAVE TEMPFILE NEAREST*****************************"

tempfile nearest
save `nearest', replace
save "`main'\nearest_dropmonitor.dta", replace


di "********************* LOAD POLLUTION DATA *****************************"

use "`main'\hourly_EPApol", clear

di "********************* POLLUTION DATA LOADED *****************************"

rename param pollutant
// fix the datetime variable (i had coded it up weirdly the first time, just fixing that)
gen dtemp = datetime + 2*60*1000	// add 2 minutes since the minutes are either 0 1 or 58 59
gen dd = dofc(dtemp)
gen hh = hh(dtemp)
gen tt = hms(hh, 00, 00)
format tt %tcHH:MM
drop datetime
gen double datetime = dd*24*60*60*1000 + tt
format datetime %tcNN/DD/CCYY_HH:MM
gen year = year(dofc(datetime))

// keep only year specified above
keep if year == `yy'
// keep only between 8am and 3pm
keep if hh >=8 & hh<=15

drop dtemp dd hh tt

tostring sitenum, replace
gen sitecnty = county + " " + sitenum
rename (sitecnty latitude longitude value units) (EPAsite EPAlat EPAlon EPAvalue EPAunits)
keep EPAsite EPAl* pollutant EPAunits datetime EPAvalue year
merge m:1 EPAsite using `nearest', nogen
drop if pollutant=="" // will drop EPA sites that are actually not available during a particular year
dropmiss *, force
gen month = month(dofc(datetime))
gen day = day(dofc(datetime))
gen hour = hh(datetime)

gen namepol = "PM25" if strmatch(pollutant, "*PM2.5*AQI*")
replace namepol = "PM25_BC" if strmatch(pollutant, "Black*PM2.5*")
replace namepol = "PM25_loc" if strmatch(pollutant, "*PM2.5*Local*")
replace namepol = "PM10" if strmatch(pollutant, "*PM10*Total*")
replace namepol = "CO" if strmatch(pollutant, "*Carbon*mono*")
replace namepol = "NO" if strmatch(pollutant, "*Nitric*")
replace namepol = "NO2" if strmatch(pollutant, "*Nitrogen*di*")
replace namepol = "NOx" if strmatch(pollutant, "*Oxides*of*")
replace namepol = "NOy" if strmatch(pollutant, "*Reactive*nitrogen*")
replace namepol = "SO2" if strmatch(pollutant, "*Sulfur*")
replace namepol = "Ozone" if pollutant=="Ozone"
rename pollutant pollutantold
rename namepol pollutant 

// save a dataset with EPA data and nearest wind sites (<`prefdist' miles) associated with it
save `nearest', replace
save "`main'\nearest_dropmonitor.dta", replace // use for troubleshooting
ds windsite*
scalar nwindsite = wordcount("`r(varlist)'") // max number of windsites within `prefdist' miles

di "********************* STEP 3 *****************************"

*******************************************************************************
// STEP 3. Merge with wind direction data
// Since each EPA site has at least one wind monitor associated with it, 
// do the merging in layers (first merge based on the first closest, then 
// second closest etc)
*******************************************************************************

forval vv = 1/`=nwindsite' { // if we're only interested in the closest monitor use vv=1/1, otherwise use `=nwindsite' {
	if `vv' == 1 local ind "1st"
	if `vv' == 2 local ind "2nd"
	if `vv' == 3 local ind "3rd"
	if `vv'>3 local ind "`vv'th"
	
	use `winddir', clear
	preserve
	gen windsite`vv' = station
	merge 1:m windsite`vv' year month day hour using `nearest', nogen
	// fix datetime to have minute be 00 (i did this when filling in in STEP 1 but just in case)
	gen dd = dofc(datetime)
	gen tt = hms(hour, 00,00)
	drop datetime
	gen double datetime = dd*24*60*60*1000 + tt
	format datetime %tcNN/DD/CCYY_HH:MM
	drop dd tt
	
	keep datetime EPAsite pollutant EPAunits EPAvalue EPAl* windsite`vv' *circmean* *windvalue* dist`vv' windlat`vv' windlon`vv' windvar windunits  minute pollutantold filled filled_day
	order datetime EPAsite pollutant EPAunits EPAvalue windsite`vv' *circmean* *windvalue* dist`vv' EPAl* windlat`vv' windlon`vv' windvar windunits
	sort EPAsite pollutant datetime
	rename (*windvalue* *circmean* minute) (*windvalue*`vv' *circmean*`vv' minute`vv')
	
	la var windvalue`vv' "Wind direction, `ind' closest, degrees north [MADIS SFC]"	
	la var im_windvalue`vv' "Filled in, wind direction, `ind' closest, degrees north [MADIS SFC]"
	la var im_windvalue_1dayout`vv' "Filled in 1 day out, wind direction, `ind' closest, degrees north [MADIS SFC]"
	la var im_windvalue_3dayout`vv' "Filled in 3 days out, wind direction, `ind' closest, degrees north [MADIS SFC]"
	la var circmean`vv' "Wind direction, `ind' closest, circular mean"
	la var im_circmean`vv' "Filled in, wind direction, `ind' closest, circular mean"
	la var im_circmean_1dayout`vv' "Filled in 1 day out, wind direction, `ind' closest, circular mean"
	la var im_circmean_3dayout`vv' "Filled in 3 day out, wind direction, `ind' closest, circular mean"
	la var minute`vv' "Minute of reading of windvalue`vv' [MADIS SFC]"
	la var windsite`vv' "`ind' closest wind monitor"
	la var dist`vv' "Distance (mi) from `ind' closest wind monitor"
	la var windlat`vv' "Latitude of `ind' closest wind monitor"
	la var windlon`vv' "Longitude of `ind' closest wind monitor"
	
	drop if mi(EPAsite)
	duplicates drop  
	duplicates tag  datetime EPAsite pollutant, gen(dups)
	drop if dups==1 & EPAvalue==0
	sort datetime EPAsite pollutant 
	egen dups1 = seq(), by(datetime EPAsite pollutant) 
	drop if dups1==2
	drop dup*
	tempfile merged`vv'
	save `merged`vv'', replace
	restore
	
	
}
	
use `merged1', clear
// uncomment next 3 lines if interested in 2nd, 3rd etc closest monitor
forval vv = 2/`=nwindsite' {
	duplicates drop  
	duplicates tag  datetime EPAsite pollutant, gen(dups)
	drop if dups==1 & EPAvalue==0
	sort datetime EPAsite pollutant 
	egen dups1 = seq(), by(datetime EPAsite pollutant) 
	drop if dups1==2
	drop dup*
	merge 1:1 datetime EPAsite pollutant using `merged`vv'', nogen
}

sort EPAsite pollutant datetime
cd "`main'"
save wnpFL`yy'_`=prefdist'mi_lax_dropmonitor, replace
}
